In a prior post (LINK HERE), I began this project: a bit of a personal workshop in learning how to use Stan for spatial modeling. In that post, I did some exploratory data analysis to set up the overarching idea of the project, which is to (a) follow the example in LINK PAPER and use an ICAR model to model spatial dependencies in crashes across NYC and (b) potentially add a treatment/difference in difference component to this model to understand whether the congestion pricing policy has an effect on the number of vehicle crashes. In the interest of breaking this project into digestible chunks, I have decided that this, part 2 of 3, will focus only on the spatial modeling part. To that end, I’ll be basically following this GUIDE from Mitzi Morris.
MENTION INTERPOLATION
# Loading NYC Census Tracts: https://data.cityofnewyork.us/City-Government/2020-Census-Tracts/63ge-mke6/about_data
# Read the tract CSV (geometry stored as WKT in `the_geom`), promote it to an
# sf layer in WGS84, and keep only the columns the analysis needs.
nyc_tracts <- read.csv("data/2020_Census_Tracts_20250606.csv", stringsAsFactors = FALSE) %>%
  rename(geometry = the_geom) %>%
  st_as_sf(wkt = "geometry", crs = 4326) %>%
  # 27,878,400 sq ft per sq mile -- assumes Shape_Area is in square feet (TODO confirm)
  mutate(area_sqm = Shape_Area / 27878400) %>%
  select(geometry, BoroCT2020, BoroCode, BoroName, area_sqm, GEOID)
# Loading collisions data: https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data
# Latitude/longitude values that mark a missing or unusable crash location.
# (Renamed from `filter`, which masked dplyr::filter / stats::filter.)
bad_coords <- c(0, NA)
# Read the raw collisions extract, keep only the columns we use, drop rows
# with unusable coordinates, and promote to an sf point layer (WGS84, to
# match the tract polygons).
crashes <- read.csv("data/Motor_Vehicle_Collisions_-_Crashes_20250605.csv") %>%
  select(CRASH.DATE,
         LATITUDE,
         LONGITUDE,
         NUMBER.OF.PERSONS.INJURED,
         NUMBER.OF.PERSONS.KILLED,
         NUMBER.OF.PEDESTRIANS.INJURED,
         NUMBER.OF.PEDESTRIANS.KILLED) %>%
  # %in% matches NA as well as 0, so this removes both bad-coordinate cases
  filter(! LATITUDE %in% bad_coords,
         ! LONGITUDE %in% bad_coords) %>%
  rename(
    "date" = "CRASH.DATE",
    "lat" = "LATITUDE",
    "long" = "LONGITUDE",
    "persons_inj" = "NUMBER.OF.PERSONS.INJURED",
    "persons_death" = "NUMBER.OF.PERSONS.KILLED",
    "ped_inj" = "NUMBER.OF.PEDESTRIANS.INJURED",
    "ped_death" = "NUMBER.OF.PEDESTRIANS.KILLED"
  ) %>%
  mutate(
    date = mdy(date)  # lubridate: parse "MM/DD/YYYY" crash dates
  ) %>%
  st_as_sf(coords = c("long", "lat"), crs = 4326)
# Grabbing some census variables via Tidycensus, using Keyring for API Key privacy
# Pull 2022 ACS 5-year tract-level covariates for the five NYC counties via
# tidycensus; the API key lives in the system keyring, not in the source.
tidycensus_api_key <- key_get(service = "tidycensus_API", username = "my_tidycensus")
census_api_key(tidycensus_api_key)
census_vars <- get_acs(
  geography = "tract",
  state = "NY",
  county = c("Bronx", "Kings", "New York", "Queens", "Richmond"),
  variables = c(medincome = "B19013_001",
                population = "B01003_001",
                median_age = "B01002_001",
                transport_baseline = "B08301_001",
                transport_pubtransit = "B08301_010"),
  year = 2022,
  output = "wide",
  geometry = FALSE,
  keep_geo_vars = FALSE
) %>%
  # transmute = mutate + keep only the listed columns, in this order
  transmute(
    GEOID = as.numeric(GEOID),
    median_income = medincomeE,
    population = populationE,
    median_age = median_ageE,
    prop_pubtransit = transport_pubtransitE / transport_baselineE
  )
# Associating CP Zone, Pre/Post, Treatment, Borough, and Census Tract w/ Observations
# Tag each crash point with the tract it falls inside; crashes outside every
# tract get NA and are dropped.
crashes <- crashes %>%
  st_join(nyc_tracts, join = st_within) %>%
  filter(! is.na(area_sqm))
# Aggregate to the census-tract level (no time component).
# Drop the point geometry BEFORE grouping: summarize() on an sf object would
# otherwise union the point geometries within each group -- expensive work
# whose result was being thrown away by st_drop_geometry() anyway.
crashes_tract <- crashes %>%
  st_drop_geometry() %>%
  group_by(BoroCT2020) %>%
  summarize(tot_crashes = n(),
            area = mean(area_sqm)) %>%  # area_sqm is constant within a tract
  ungroup()
# Getting Fragmentation Index from: https://github.com/mitzimorris/geomed_2024/blob/main/data/nyc_study.geojson
# Read Morris's study file, keep only the two covariates we need; the
# geometry duplicates nyc_tracts so it is dropped immediately.
# NOTE(review): this file keys tracts by the 2010 vintage (BoroCT2010) but is
# later joined against BoroCT2020 -- confirm the IDs line up across vintages.
frag_data <- st_read(file.path("data", "nyc_study.geojson"), quiet = TRUE) %>%
  st_drop_geometry() %>%
  select(BoroCT2010, frag_index, traffic) %>%
  mutate(
    BoroCT2010 = as.numeric(BoroCT2010)  # numeric, to match the BoroCT2020 key
  )
# Joining everything together, selecting only variables that I want
# right_join keeps only tracts that actually appear in the crash data; the
# two left joins then attach the ACS covariates (by GEOID) and the
# fragmentation/traffic covariates (2010 tract IDs matched to 2020 IDs --
# NOTE(review): confirm the tract vintages line up).
crashes_tracts_geo <- nyc_tracts %>%
right_join(crashes_tract,
by = "BoroCT2020") %>%
left_join(census_vars,
by = "GEOID") %>%
left_join(frag_data,
by = c("BoroCT2020" = "BoroCT2010")) %>%
# drop the per-tract area copy carried in from crashes_tract
select(! c(area)) %>%
# listwise-delete any tract with a missing value in any column
filter(! if_any(everything(), is.na))
# Interactive Data Table for Display
# Build the display copy of the data: geometry dropped, the two
# many-decimal columns rounded so the table stays readable.
crashes_dt <- crashes_tracts_geo %>%
  st_drop_geometry() %>%
  mutate(across(c(area_sqm, prop_pubtransit), ~ round(.x, 3)))
# Render an interactive DT table with a top filter row and horizontal scroll.
datatable(crashes_dt,
          class = 'compact',
          extensions = 'Buttons',
          filter = "top",
          rownames = FALSE,
          escape = FALSE,
          options = list(autoWidth = TRUE,
                         scrollX = TRUE)) %>%
  # keep every cell on one line at a fixed height
  formatStyle(columns = names(crashes_dt),
              `white-space` = "nowrap",
              `height` = "1.5em",
              `line-height` = "1.5em")
# Histogram plus kernel density of crashes per tract; the histogram is put
# on the density scale so the two layers share a y axis.
ggplot(crashes_tracts_geo, aes(x = tot_crashes)) +
  geom_histogram(aes(y = after_stat(density)),
                 bins = 500,
                 alpha = 0.75,
                 fill = "lightblue",
                 color = "lightblue") +
  geom_density(linewidth = 1, color = "#FFC600") +
  labs(title = "Distribution Total Crashes per Census Tract (2024-25)",
       x = "Number of Crashes",
       y = "Density") +
  theme_bw()
# Choropleth of total crashes per tract, Jenks natural breaks.
crash_map <- tm_shape(crashes_tracts_geo) +
  tm_polygons(lwd = 0.3,
              fill = "tot_crashes",
              fill.legend = tm_legend(title = "Total Crashes, 2024-25"),
              fill.scale = tm_scale_intervals(style = "jenks",
                                              values = "brewer.yl_or_rd")) +
  tm_layout(legend.outside = TRUE,
            legend.outside.position = "right")
crash_map
# Median income and median age choropleths (Jenks breaks), shown side by side.
income_map <- tm_shape(crashes_tracts_geo) +
  tm_polygons(lwd = 0.3,
              fill = "median_income",
              fill.legend = tm_legend(title = "Median Income"),
              fill.scale = tm_scale_intervals(style = "jenks",
                                              values = "brewer.greens")) +
  tm_layout(legend.outside = TRUE,
            legend.outside.position = "right")
medianage_map <- tm_shape(crashes_tracts_geo) +
  tm_polygons(lwd = 0.3,
              fill = "median_age",
              fill.legend = tm_legend(title = "Median Age"),
              fill.scale = tm_scale_intervals(style = "jenks",
                                              values = "brewer.blues")) +
  tm_layout(legend.outside = TRUE,
            legend.outside.position = "right")
tmap_arrange(income_map, medianage_map, nrow = 1, ncol = 2)
# Public-transit commute share and population choropleths, side by side.
prop_pubtransit_map <- tm_shape(crashes_tracts_geo) +
  tm_polygons(lwd = 0.3,
              fill = "prop_pubtransit",
              fill.legend = tm_legend(title = "Proportion that Commute Using Public Transit"),
              fill.scale = tm_scale_intervals(style = "jenks",
                                              values = "brewer.purples",
                                              value.na = "grey")) +
  tm_layout(legend.outside = TRUE,
            legend.outside.position = "right")
pop_per_sqm_map <- tm_shape(crashes_tracts_geo) +
  tm_polygons(lwd = 0.3,
              fill = "population",
              fill.legend = tm_legend(title = "Population"),
              fill.scale = tm_scale_intervals(style = "jenks",
                                              values = "brewer.yl_gn")) +
  tm_layout(legend.outside = TRUE,
            legend.outside.position = "right")
tmap_arrange(prop_pubtransit_map, pop_per_sqm_map, nrow = 1, ncol = 2)
# Fragmentation index and traffic choropleths (quantile breaks, midpoint 0),
# side by side.
frag_map <- tm_shape(crashes_tracts_geo) +
  tm_polygons(lwd = 0.3,
              fill = "frag_index",
              fill.legend = tm_legend(title = "Fragmentation Index"),
              fill.scale = tm_scale_intervals(style = "quantile",
                                              midpoint = 0,
                                              values = "brewer.pu_or",
                                              value.na = "grey")) +
  tm_layout(legend.outside = TRUE,
            legend.outside.position = "right")
traffic_map <- tm_shape(crashes_tracts_geo) +
  tm_polygons(lwd = 0.3,
              fill = "traffic",
              fill.legend = tm_legend(title = "Traffic"),
              fill.scale = tm_scale_intervals(style = "quantile",
                                              midpoint = 0,
                                              values = "brewer.reds",
                                              value.na = "grey")) +
  tm_layout(legend.outside = TRUE,
            legend.outside.position = "right")
tmap_arrange(frag_map, traffic_map, nrow = 1, ncol = 2)
# Queen-contiguity neighbour list for the tracts (default poly2nb behavior).
nyc_nbs = poly2nb(crashes_tracts_geo)
# Tract centroids, used only for drawing the graph.
# NOTE(review): st_centroid on an unprojected EPSG:4326 layer computes planar
# centroids and emits a warning; fine for plotting, but project first if these
# coordinates are ever used for distance work.
nyc_coords = st_coordinates(st_centroid(crashes_tracts_geo['geometry']))
# Create a spatial points sf object for centroids
centroids_sf <- st_as_sf(data.frame(x = nyc_coords[,1], y = nyc_coords[,2]),
coords = c("x", "y"),
crs = st_crs(crashes_tracts_geo))
# Plot with tmap: white tract polygons, black centroid dots, blue graph edges
spdep_map <- tm_shape(crashes_tracts_geo) +
tm_polygons(col = "white") +
tm_lines(col = "white", lwd = 2) +
# Add points for centroids
tm_shape(centroids_sf) +
tm_dots(size = 0.25, col = "black") +
# Add lines for neighbors (nb2lines converts the nb object to line segments)
tm_shape(st_as_sf(nb2lines(nyc_nbs, coords = nyc_coords))) +
tm_lines(col = "deepskyblue3")
spdep_map
In the output below, we can see that there are 2 regions with 0 links (islands), and 7 disjoint subgraphs, meaning networks between which there are no paths (Staten Island is a disjoint subgraph). This was done using the Queen’s case method (think of the squares that a Queen can reach on a chessboard—not only directly bordering neighbors, but diagonals as well).
Morris also recommends checking the rook’s case, so I’m outputting the results from both below:
# Rook contiguity (shared edges only, no corner-touching neighbours), for
# comparison against the queen graph built above.
nyc_nbs_rook = poly2nb(crashes_tracts_geo, queen=FALSE)
## Warning in poly2nb(crashes_tracts_geo, queen = FALSE): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(crashes_tracts_geo, queen = FALSE): neighbour object has 8 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
# Print both summaries side by side for comparison.
cat("*** Queen metric nb summary ***\n\n")
## *** Queen metric nb summary ***
summary(nyc_nbs)
## Neighbour list object:
## Number of regions: 1956
## Number of nonzero links: 10788
## Percentage nonzero weights: 0.2819702
## Average number of links: 5.515337
## 2 regions with no links:
## 259, 740
## 7 disjoint connected subgraphs
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 2 18 53 143 286 466 464 314 149 39 12 6 2 2
## 18 least connected regions:
## 55 257 521 581 582 644 835 839 893 1210 1249 1402 1490 1576 1591 1605 1635 1893 with 1 link
## 2 most connected regions:
## 1718 1785 with 13 links
cat("\n\n*** Rook metric nb summary ***\n\n")
##
##
## *** Rook metric nb summary ***
summary(nyc_nbs_rook)
## Neighbour list object:
## Number of regions: 1956
## Number of nonzero links: 9052
## Percentage nonzero weights: 0.2365957
## Average number of links: 4.627812
## 3 regions with no links:
## 259, 740, 835
## 8 disjoint connected subgraphs
## Link number distribution:
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 3 23 77 283 513 599 311 92 38 9 5 2 1
## 23 least connected regions:
## 55 104 257 391 521 581 582 644 839 893 1210 1249 1402 1421 1453 1490 1518 1576 1591 1605 1635 1726 1893 with 1 link
## 1 most connected region:
## 1718 with 12 links
So…really not much difference. On average, a given tract has about one more link under the Queen’s case than under the Rook’s.
NEED TO ADD MORE LINKS! OR FIGURE OUT MISSINGNESS
Exactly what was needed in this department was a touch confusing to me. Essentially, I got the idea that (a) I should examine whether existing links in the graph made sense (e.g., if there’s a link across water, does it make sense to keep it in from a pedestrian-focused context?) and then (b) make sure the final graph is fully connected, because that’s a requirement of the ICAR model I want to use. In Morris’s example it seems that part (a) was mostly an exercise in spatial data science, whereas the actual dataset used for the final analysis is the original graph with all of its cross-water connections and added links to make it a single component. The code below achieves this, and checks my work.
# Add an undirected (symmetric) edge between regions i and j in an
# spdep-style neighbours list.
#
# In an `nb` object a region with no neighbours is encoded as the single
# value 0, so that sentinel is replaced outright rather than appended to;
# otherwise the new neighbour id is inserted and the vector re-sorted,
# matching spdep's sorted-neighbour convention. Edges that already exist
# are left untouched, and both directions are updated so the structure
# stays symmetric.
#
# @param nb Neighbours list: a list of sorted integer vectors, 0 = no links.
# @param i,j Indices of the two regions to connect.
# @return The neighbours list with the i <-> j edge present.
add_symmetric_edge_nb <- function(nb, i, j) {
  nb_res <- nb
  if (!(j %in% nb[[i]])) {
    if (nb[[i]][1] == 0) {
      nb_res[[i]] <- j  # i had no neighbours: replace the 0 sentinel
    } else {
      nb_res[[i]] <- sort(c(nb[[i]], j))
    }
  }
  if (!(i %in% nb[[j]])) {
    if (nb[[j]][1] == 0) {
      nb_res[[j]] <- i  # j had no neighbours: replace the 0 sentinel
    } else {
      nb_res[[j]] <- sort(c(nb[[j]], i))
    }
  }
  nb_res
}
# Rebuild the queen-contiguity graph, then hand-add edges to stitch the
# disconnected components together -- the ICAR model requires one fully
# connected graph. Tract IDs below are BoroCT2020 codes; the leading digit
# is the borough (1=Manhattan, 2=Bronx, 3=Brooklyn, 4=Queens,
# 5=Staten Island -- TODO confirm which physical crossings these represent).
nyc_nbs_connected = poly2nb(crashes_tracts_geo)
## Warning in poly2nb(crashes_tracts_geo): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(crashes_tracts_geo): neighbour object has 7 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
# Brooklyn <-> Staten Island
nyc_nbs_connected = add_symmetric_edge_nb(nyc_nbs_connected,
which(crashes_tracts_geo$BoroCT2020 == 3016400),
which(crashes_tracts_geo$BoroCT2020 == 5001800))
# Queens tracts
nyc_nbs_connected = add_symmetric_edge_nb(nyc_nbs_connected,
which(crashes_tracts_geo$BoroCT2020 == 4088400),
which(crashes_tracts_geo$BoroCT2020 == 4107201))
nyc_nbs_connected = add_symmetric_edge_nb(nyc_nbs_connected,
which(crashes_tracts_geo$BoroCT2020 == 4107201),
which(crashes_tracts_geo$BoroCT2020 == 4094201))
nyc_nbs_connected = add_symmetric_edge_nb(nyc_nbs_connected,
which(crashes_tracts_geo$BoroCT2020 == 4099801),
which(crashes_tracts_geo$BoroCT2020 == 4103202))
# Bronx tracts
nyc_nbs_connected = add_symmetric_edge_nb(nyc_nbs_connected,
which(crashes_tracts_geo$BoroCT2020 == 2045600),
which(crashes_tracts_geo$BoroCT2020 == 2042600))
# Manhattan <-> Bronx
nyc_nbs_connected = add_symmetric_edge_nb(nyc_nbs_connected,
which(crashes_tracts_geo$BoroCT2020 == 1029900),
which(crashes_tracts_geo$BoroCT2020 == 2026900))
# Checking work: redraw the neighbour graph with the hand-added edges so the
# new cross-water connections are visible.
spdep_map <- tm_shape(crashes_tracts_geo) +
tm_polygons(col = "white") +
tm_lines(col = "white", lwd = 2) +
# Add points for centroids
tm_shape(centroids_sf) +
tm_dots(size = 0.25, col = "black") +
# Add lines for neighbors (now the stitched, connected graph)
tm_shape(st_as_sf(nb2lines(nyc_nbs_connected, coords = nyc_coords))) +
tm_lines(col = "deepskyblue3")
## Warning in CRS(proj4string): CRS: projargs should not be NULL; set to NA
## Warning in CRS(proj4string): CRS: projargs should not be NULL; set to NA
spdep_map
# The ICAR model needs a symmetric adjacency structure; confirm the
# hand-edited graph is still symmetric.
cat('is symmetric? ', is.symmetric.nb(nyc_nbs_connected), '\n')
## is symmetric? TRUE
summary(nyc_nbs_connected)
## Neighbour list object:
## Number of regions: 1956
## Number of nonzero links: 10800
## Percentage nonzero weights: 0.2822839
## Average number of links: 5.521472
## 7 disjoint connected subgraphs
## Link number distribution:
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 19 52 144 287 461 469 313 150 39 12 6 2 2
## 19 least connected regions:
## 55 257 259 521 581 582 644 835 839 893 1210 1249 1402 1490 1576 1591 1605 1635 1893 with 1 link
## 2 most connected regions:
## 1718 1785 with 13 links
cat('nodes per component')
## nodes per component
# A single component containing all 1956 tracts means the graph is connected.
table(n.comp.nb(nyc_nbs_connected)$comp.id)
##
## 1
## 1956
This is where this notebook gets long. I’m going to document my whole process for iteratively fitting better and better models using Stan.
# Path to Morris's baseline (non-spatial) Poisson regression Stan program.
poisson_model_file <- file.path('stan', 'poisson_1.stan')
# Echo the Stan source into the rendered document for discussion below.
cat(readLines(poisson_model_file), sep="\n")
## data {
## int<lower=0> N;
## array[N] int<lower=0> y; // count outcomes
## vector<lower=0>[N] E; // exposure
## int<lower=1> K; // num covariates
## matrix[N, K] xs; // design matrix
## }
## transformed data {
## vector[N] log_E = log(E);
## }
## parameters {
## real beta0; // intercept
## vector[K] betas; // covariates
## }
## model {
## y ~ poisson_log(log_E + beta0 + xs * betas);
## beta0 ~ std_normal();
## betas ~ std_normal();
## }
## generated quantities {
## array[N] int y_rep;
## vector[N] log_lik;
## {
## vector[N] eta = log_E + beta0 + xs * betas;
## y_rep = max(eta) < 26 ? poisson_log_rng(eta) : rep_array(-1, N);
## for (n in 1:N) {
## log_lik[n] = poisson_log_lpmf(y[n] | eta[n]);
## }
## }
## }
So, this is not my model, it’s the simple Poisson model that Morris uses. So, rather than understanding it because I wrote it, I’m going to understand it because I explain it to you in this section. First, the data: 1. N: The number of observations (in this case, the number of census tracts) 2. y: The outcome measure, number of crashes in 2024-25 3. E: The “exposure” variable, which is the population of each tract. I believe this is separated from the rest of the predictor matrix, xs, so that it can be log transformed and then used as a separate component in the linear predictor. 4. K: The number of predictor variables 5. xs: A matrix containing N rows and K columns, which holds the predictor variables
The transformed data block takes the log of population (E), which is a generally good principle for data that’s constrained to be all positive. It also means that the coefficient goes to the multiplicative scale, which is nice for interpretation.
In the parameters block, Morris defines an intercept, beta0, and a vector of coefficients, betas, for the covariates. The beta0 coefficient is split out so it can be applied to the log_E data.
The model specifies that y comes from a Poisson distribution, and gives beta0 and betas simple standard normal distributions (mean 0, sd 1).
Lastly, the generated quantities block. In Morris’s words, this is
for use in model checking and model comparison. Y_rep is used to hold
replicated data. Morris uses some optimization in this block that I
don’t entirely (or even really) understand, but this will be used to
conduct posterior predictive checks. That is, we can simulate fake data
and check their characteristics (mean, sd, quantiles) to see if the
model does a good job. Morris uses leave-one-out cross validation, and
uses the output from the loo() function–Expected Log
Predictive Density (ELPD)–which gives the log-probability of new data
given the model. A higher ELPD is better.
# Design matrix: log-transform the two heavily right-skewed covariates.
# NOTE(review): log() yields -Inf if any tract has traffic == 0 or
# median_income == 0 -- worth asserting both are strictly positive here.
design_mat <- as.data.frame(crashes_tracts_geo) %>%
  select(prop_pubtransit, median_income, median_age, frag_index, traffic) %>%
  mutate(median_income = log(median_income),
         traffic = log(traffic))
# Data list matching the Stan program's data block.
pois_data <- list(
  N = nrow(crashes_tracts_geo),
  y = as.integer(crashes_tracts_geo$tot_crashes),  # count outcome
  E = as.integer(crashes_tracts_geo$population),   # exposure: tract population
  K = ncol(design_mat),
  xs = as.matrix(design_mat)  # Stan declares xs as a matrix; be explicit
)
poisson_model_1 <- cmdstan_model(poisson_model_file)
# set.seed() only seeds R's RNG, not CmdStan's sampler; pass `seed` to
# $sample() so the MCMC run itself is reproducible.
set.seed(50)
poisson_fit_1 <- poisson_model_1$sample(data = pois_data,
                                        seed = 50,
                                        parallel_chains = cores,
                                        refresh = 0)
## Running MCMC with 4 chains, at most 6 in parallel...
##
## Chain 4 finished in 9.9 seconds.
## Chain 1 finished in 10.6 seconds.
## Chain 3 finished in 10.7 seconds.
## Chain 2 finished in 11.9 seconds.
##
## All 4 chains finished successfully.
## Mean chain execution time: 10.8 seconds.
## Total execution time: 12.1 seconds.
# Quick look at the fit: rhat ~ 1.00 and healthy bulk/tail ESS across the
# parameters indicate the chains mixed well.
print(poisson_fit_1)
## variable mean median sd mad q5 q95 rhat ess_bulk
## lp__ 283546.16 283546.00 1.76 1.48 283543.00 283548.00 1.00 1598
## beta0 -6.41 -6.41 0.09 0.09 -6.56 -6.25 1.00 1065
## betas[1] -0.20 -0.20 0.03 0.02 -0.24 -0.16 1.00 1099
## betas[2] -0.03 -0.03 0.01 0.01 -0.04 -0.02 1.00 1248
## betas[3] -0.01 -0.01 0.00 0.00 -0.01 -0.01 1.00 4725
## betas[4] 0.00 0.00 0.00 0.00 0.00 0.01 1.00 2457
## betas[5] 0.26 0.26 0.00 0.00 0.26 0.27 1.00 1814
## y_rep[1] 29.84 30.00 5.46 5.93 21.00 39.00 1.00 3761
## y_rep[2] 54.56 55.00 7.29 7.41 43.00 67.00 1.00 3293
## y_rep[3] 102.82 103.00 10.03 10.38 86.00 120.00 1.00 4129
## ess_tail
## 2082
## 1509
## 1723
## 1831
## 3012
## 2464
## 2146
## 3700
## 3827
## 3909
##
## # showing 10 of 3919 rows (change via 'max_rows' argument or 'cmdstanr_max_rows' option)
Refinement: Mean-Center Predictor Data